Load necessary packages.

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lattice)
library(RColorBrewer)
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.4.4
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Step 1 : Import the learning and text files

#read learning csv file
census = read.csv("census_income_learn.csv", header = FALSE)

#read txt file containing information about census data
#comments (lines starting with "|") are ignored
metaData = read.delim("/Users/kristinlee/Documents/census/census_income_metadata.txt", stringsAsFactors = FALSE, sep = "\n", fill = TRUE, comment = "|")

#get column names for dataset from metadata file
#add last column: income level (+/- 50000)
colNames = c(apply(metaData, 1, function(i) {
  gsub(":.*", "", i)
}), "income level")

#collapse names: change spaces to _
colNames_noSpace = gsub(" ", "_", colNames)

#rename columns of dataset
colnames(census) = colNames_noSpace

#get each variable type - in the metadata file, numeric are listed as "continuous" while the rest ("nominal") have the possible levels and will be coded here as factors
varDescriptions = apply(metaData, 1, function(i){
  gsub(".*:", "", i)
})

#get continuous variables by which ones have continuous in description
contVar = which(grepl("continuous", varDescriptions))

#what are the continuous variables?
colNames_noSpace[contVar]
## [1] "age"                             "wage_per_hour"                  
## [3] "capital_gains"                   "capital_losses"                 
## [5] "dividends_from_stocks"           "instance_weight"                
## [7] "num_persons_worked_for_employer" "weeks_worked_in_year"
#Note: metadata file says to ignore instance_weight in classifiers

#get rest as non-continuous variables
notContVar = c(1:length(varDescriptions))[-contVar]

#treat all non-continuous variables as factors
for(i in notContVar) {
  census[ ,i] = as.factor(census[ ,i])  
}

#double check this is correct
#get type of continuous variables
sapply(contVar, function(i) class(census[ , i]))
## [1] "integer" "integer" "integer" "integer" "integer" "numeric" "integer"
## [8] "integer"
#get type of non-continuous variables
sapply(notContVar, function(i) class(census[ , i]))
##  [1] "factor" "factor" "factor" "factor" "factor" "factor" "factor"
##  [8] "factor" "factor" "factor" "factor" "factor" "factor" "factor"
## [15] "factor" "factor" "factor" "factor" "factor" "factor" "factor"
## [22] "factor" "factor" "factor" "factor" "factor" "factor" "factor"
## [29] "factor" "factor" "factor" "factor" "factor"

Step 2: Based on the learning file, make a quick statistic based and univariate audit of the different columns’ content and produce the results in visual / graphic format. This audit should describe the variable distribution, the % of missing values, the extreme values, and so on.

#get structure of data frame
str(census)
## 'data.frame':    199523 obs. of  42 variables:
##  $ age                                       : int  73 58 18 9 10 48 42 28 47 34 ...
##  $ class_of_worker                           : Factor w/ 9 levels " Federal government",..: 4 7 4 4 4 5 5 5 2 5 ...
##  $ detailed_industry_recode                  : Factor w/ 52 levels "0","1","2","3",..: 1 5 1 1 1 41 35 5 44 5 ...
##  $ detailed_occupation_recode                : Factor w/ 47 levels "0","1","2","3",..: 1 35 1 1 1 11 4 41 27 38 ...
##  $ education                                 : Factor w/ 17 levels " 10th grade",..: 13 17 1 11 11 17 10 13 17 17 ...
##  $ wage_per_hour                             : int  0 0 0 0 0 1200 0 0 876 0 ...
##  $ enroll_in_edu_inst_last_wk                : Factor w/ 3 levels " College or university",..: 3 3 2 3 3 3 3 3 3 3 ...
##  $ marital_stat                              : Factor w/ 7 levels " Divorced"," Married-A F spouse present",..: 7 1 5 5 5 3 3 5 3 3 ...
##  $ major_industry_code                       : Factor w/ 24 levels " Agriculture",..: 15 5 15 15 15 7 8 5 6 5 ...
##  $ major_occupation_code                     : Factor w/ 15 levels " Adm support including clerical",..: 7 9 7 7 7 11 3 5 1 6 ...
##  $ race                                      : Factor w/ 5 levels " Amer Indian Aleut or Eskimo",..: 5 5 2 5 5 1 5 5 5 5 ...
##  $ hispanic_origin                           : Factor w/ 10 levels " All other"," Central or South American",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex                                       : Factor w/ 2 levels " Female"," Male": 1 2 1 1 1 1 2 1 1 2 ...
##  $ member_of_a_labor_union                   : Factor w/ 3 levels " No"," Not in universe",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ reason_for_unemployment                   : Factor w/ 6 levels " Job leaver",..: 4 4 4 4 4 4 4 2 4 4 ...
##  $ full_or_part_time_employment_stat         : Factor w/ 8 levels " Children or Armed Forces",..: 3 1 3 1 1 2 1 7 2 1 ...
##  $ capital_gains                             : int  0 0 0 0 0 0 5178 0 0 0 ...
##  $ capital_losses                            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ dividends_from_stocks                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ tax_filer_stat                            : Factor w/ 6 levels " Head of household",..: 5 1 5 5 5 3 3 6 3 3 ...
##  $ region_of_previous_residence              : Factor w/ 6 levels " Abroad"," Midwest",..: 4 5 4 4 4 4 4 4 4 4 ...
##  $ state_of_previous_residence               : Factor w/ 51 levels " ?"," Abroad",..: 37 6 37 37 37 37 37 37 37 37 ...
##  $ detailed_household_and_family_stat        : Factor w/ 38 levels " Child <18 ever marr not in subfamily",..: 30 21 8 3 3 37 21 36 37 21 ...
##  $ detailed_household_summary_in_household   : Factor w/ 8 levels " Child 18 or older",..: 7 5 1 3 3 8 5 6 8 5 ...
##  $ instance_weight                           : num  1700 1054 992 1758 1069 ...
##  $ migration_code-change_in_msa              : Factor w/ 10 levels " ?"," Abroad to MSA",..: 1 4 1 6 6 1 6 1 1 6 ...
##  $ migration_code-change_in_reg              : Factor w/ 9 levels " ?"," Abroad",..: 1 9 1 7 7 1 7 1 1 7 ...
##  $ migration_code-move_within_reg            : Factor w/ 10 levels " ?"," Abroad",..: 1 10 1 8 8 1 8 1 1 8 ...
##  $ live_in_this_house_1_year_ago             : Factor w/ 3 levels " No"," Not in universe under 1 year old",..: 2 1 2 3 3 2 3 2 2 3 ...
##  $ migration_prev_res_in_sunbelt             : Factor w/ 4 levels " ?"," No"," Not in universe",..: 1 4 1 3 3 1 3 1 1 3 ...
##  $ num_persons_worked_for_employer           : int  0 1 0 0 0 1 6 4 5 6 ...
##  $ family_members_under_18                   : Factor w/ 5 levels " Both parents present",..: 5 5 5 1 1 5 5 5 5 5 ...
##  $ country_of_birth_father                   : Factor w/ 43 levels " ?"," Cambodia",..: 41 41 42 41 41 32 41 41 41 41 ...
##  $ country_of_birth_mother                   : Factor w/ 43 levels " ?"," Cambodia",..: 41 41 42 41 41 41 41 41 41 41 ...
##  $ country_of_birth_self                     : Factor w/ 43 levels " ?"," Cambodia",..: 41 41 42 41 41 41 41 41 41 41 ...
##  $ citizenship                               : Factor w/ 5 levels " Foreign born- Not a citizen of U S ",..: 5 5 1 5 5 5 5 5 5 5 ...
##  $ own_business_or_self_employed             : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 3 1 1 1 1 ...
##  $ fill_inc_questionnaire_for_veteran's_admin: Factor w/ 3 levels " No"," Not in universe",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ veterans_benefits                         : Factor w/ 3 levels "0","1","2": 3 3 3 1 1 3 3 3 3 3 ...
##  $ weeks_worked_in_year                      : int  0 52 0 0 0 52 52 30 52 52 ...
##  $ year                                      : Factor w/ 2 levels "94","95": 2 1 2 1 1 2 1 2 2 1 ...
##  $ income_level                              : Factor w/ 2 levels " - 50000."," 50000+.": 1 1 1 1 1 1 1 1 1 1 ...
##make plots for each variable to get a sense of distribution

#set color palette to use
colPal = brewer.pal(4, "PuOr")
colPal_all = rep(colPal, ceiling(ncol(census)/4))

#make density plots for variables of type numeric or integer
for(i in contVar) {
  plot(density(census[ ,i]), main = paste("Distribution of", colNames[i]), col = colPal_all[i], cex.main = 0.7)
}

#make barplots for all factors
for(i in notContVar) {
  plot(census[ ,i], main = colNames[i], las = 2, col = colPal_all[i], ylab = "Number of entries", cex.axis = 0.7, cex.names = 0.6 )
}

##there are variables with "?" and "Not in universe"
##we'll treat this as missing data and set to NA to make it easier to work with
#note: this is a bit more complicated because we're dealing with factors but we can use recode_factor function from dplyr package
for(i in notContVar) {
  census[ ,i] = recode_factor(census[ ,i], " ?" = NA_character_)
  census[ ,i] = recode_factor(census[ ,i], " Not in universe" = NA_character_)
  census[ ,i] = recode_factor(census[ ,i], " Not in universe under 1 year old" = NA_character_)
  census[ ,i] = recode_factor(census[ ,i], " Not in universe or children" = NA_character_)
}

#recheck structure of data
str(census)
## 'data.frame':    199523 obs. of  42 variables:
##  $ age                                       : int  73 58 18 9 10 48 42 28 47 34 ...
##  $ class_of_worker                           : Factor w/ 8 levels " Federal government",..: NA 6 NA NA NA 4 4 4 2 4 ...
##  $ detailed_industry_recode                  : Factor w/ 52 levels "0","1","2","3",..: 1 5 1 1 1 41 35 5 44 5 ...
##  $ detailed_occupation_recode                : Factor w/ 47 levels "0","1","2","3",..: 1 35 1 1 1 11 4 41 27 38 ...
##  $ education                                 : Factor w/ 17 levels " 10th grade",..: 13 17 1 11 11 17 10 13 17 17 ...
##  $ wage_per_hour                             : int  0 0 0 0 0 1200 0 0 876 0 ...
##  $ enroll_in_edu_inst_last_wk                : Factor w/ 2 levels " College or university",..: NA NA 2 NA NA NA NA NA NA NA ...
##  $ marital_stat                              : Factor w/ 7 levels " Divorced"," Married-A F spouse present",..: 7 1 5 5 5 3 3 5 3 3 ...
##  $ major_industry_code                       : Factor w/ 23 levels " Agriculture",..: NA 5 NA NA NA 7 8 5 6 5 ...
##  $ major_occupation_code                     : Factor w/ 14 levels " Adm support including clerical",..: NA 8 NA NA NA 10 3 5 1 6 ...
##  $ race                                      : Factor w/ 5 levels " Amer Indian Aleut or Eskimo",..: 5 5 2 5 5 1 5 5 5 5 ...
##  $ hispanic_origin                           : Factor w/ 10 levels " All other"," Central or South American",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex                                       : Factor w/ 2 levels " Female"," Male": 1 2 1 1 1 1 2 1 1 2 ...
##  $ member_of_a_labor_union                   : Factor w/ 2 levels " No"," Yes": NA NA NA NA NA 1 NA NA 1 NA ...
##  $ reason_for_unemployment                   : Factor w/ 5 levels " Job leaver",..: NA NA NA NA NA NA NA 2 NA NA ...
##  $ full_or_part_time_employment_stat         : Factor w/ 8 levels " Children or Armed Forces",..: 3 1 3 1 1 2 1 7 2 1 ...
##  $ capital_gains                             : int  0 0 0 0 0 0 5178 0 0 0 ...
##  $ capital_losses                            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ dividends_from_stocks                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ tax_filer_stat                            : Factor w/ 6 levels " Head of household",..: 5 1 5 5 5 3 3 6 3 3 ...
##  $ region_of_previous_residence              : Factor w/ 5 levels " Abroad"," Midwest",..: NA 4 NA NA NA NA NA NA NA NA ...
##  $ state_of_previous_residence               : Factor w/ 49 levels " Abroad"," Alabama",..: NA 5 NA NA NA NA NA NA NA NA ...
##  $ detailed_household_and_family_stat        : Factor w/ 38 levels " Child <18 ever marr not in subfamily",..: 30 21 8 3 3 37 21 36 37 21 ...
##  $ detailed_household_summary_in_household   : Factor w/ 8 levels " Child 18 or older",..: 7 5 1 3 3 8 5 6 8 5 ...
##  $ instance_weight                           : num  1700 1054 992 1758 1069 ...
##  $ migration_code-change_in_msa              : Factor w/ 8 levels " Abroad to MSA",..: NA 3 NA 5 5 NA 5 NA NA 5 ...
##  $ migration_code-change_in_reg              : Factor w/ 7 levels " Abroad"," Different county same state",..: NA 7 NA 6 6 NA 6 NA NA 6 ...
##  $ migration_code-move_within_reg            : Factor w/ 8 levels " Abroad"," Different county same state",..: NA 8 NA 7 7 NA 7 NA NA 7 ...
##  $ live_in_this_house_1_year_ago             : Factor w/ 2 levels " No"," Yes": NA 1 NA 2 2 NA 2 NA NA 2 ...
##  $ migration_prev_res_in_sunbelt             : Factor w/ 2 levels " No"," Yes": NA 2 NA NA NA NA NA NA NA NA ...
##  $ num_persons_worked_for_employer           : int  0 1 0 0 0 1 6 4 5 6 ...
##  $ family_members_under_18                   : Factor w/ 4 levels " Both parents present",..: NA NA NA 1 1 NA NA NA NA NA ...
##  $ country_of_birth_father                   : Factor w/ 42 levels " Cambodia"," Canada",..: 40 40 41 40 40 31 40 40 40 40 ...
##  $ country_of_birth_mother                   : Factor w/ 42 levels " Cambodia"," Canada",..: 40 40 41 40 40 40 40 40 40 40 ...
##  $ country_of_birth_self                     : Factor w/ 42 levels " Cambodia"," Canada",..: 40 40 41 40 40 40 40 40 40 40 ...
##  $ citizenship                               : Factor w/ 5 levels " Foreign born- Not a citizen of U S ",..: 5 5 1 5 5 5 5 5 5 5 ...
##  $ own_business_or_self_employed             : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 3 1 1 1 1 ...
##  $ fill_inc_questionnaire_for_veteran's_admin: Factor w/ 2 levels " No"," Yes": NA NA NA NA NA NA NA NA NA NA ...
##  $ veterans_benefits                         : Factor w/ 3 levels "0","1","2": 3 3 3 1 1 3 3 3 3 3 ...
##  $ weeks_worked_in_year                      : int  0 52 0 0 0 52 52 30 52 52 ...
##  $ year                                      : Factor w/ 2 levels "94","95": 2 1 2 1 1 2 1 2 2 1 ...
##  $ income_level                              : Factor w/ 2 levels " - 50000."," 50000+.": 1 1 1 1 1 1 1 1 1 1 ...
#rename income_level levels to under/over because current naming (- 50000/50000+) could be more intuitive and has some weird spacing
levels(census[ ,"income_level"])
## [1] " - 50000." " 50000+."
levels(census[ ,"income_level"]) = c("under", "over")

Step 3: Create a model using these variables (you can use whichever variables you want, or even create you own; for example, you could find the ratio or relationship between different variables, the one-hot encoding of “categorical” variables, etc.) to model wining more or less than $50,000 / year. Here, the idea would be for you to test one or two algorithms, such as logistic regression, or a decision tree. Feel free to choose others if wish.

## I want to explore the data further before choosing variables for model building.

#get how many entries (rows) have missing data
sum(apply(census, 1, function(i) sum(is.na(i)) > 0))
## [1] 199523
#every row has some amount of missing data, which is not surprising given that many entries were "Not in universe"
nrow(census)
## [1] 199523
#get number of entries with missing data for each variable
numMissingPerCol = apply(census, 2, function(i) sum(is.na(i)))

numMissingPerCol
##                                        age 
##                                          0 
##                            class_of_worker 
##                                     100245 
##                   detailed_industry_recode 
##                                          0 
##                 detailed_occupation_recode 
##                                          0 
##                                  education 
##                                          0 
##                              wage_per_hour 
##                                          0 
##                 enroll_in_edu_inst_last_wk 
##                                     186943 
##                               marital_stat 
##                                          0 
##                        major_industry_code 
##                                     100684 
##                      major_occupation_code 
##                                     100684 
##                                       race 
##                                          0 
##                            hispanic_origin 
##                                          0 
##                                        sex 
##                                          0 
##                    member_of_a_labor_union 
##                                     180459 
##                    reason_for_unemployment 
##                                     193453 
##          full_or_part_time_employment_stat 
##                                          0 
##                              capital_gains 
##                                          0 
##                             capital_losses 
##                                          0 
##                      dividends_from_stocks 
##                                          0 
##                             tax_filer_stat 
##                                          0 
##               region_of_previous_residence 
##                                     183750 
##                state_of_previous_residence 
##                                     184458 
##         detailed_household_and_family_stat 
##                                          0 
##    detailed_household_summary_in_household 
##                                          0 
##                            instance_weight 
##                                          0 
##               migration_code-change_in_msa 
##                                     101212 
##               migration_code-change_in_reg 
##                                     101212 
##             migration_code-move_within_reg 
##                                     101212 
##              live_in_this_house_1_year_ago 
##                                     101212 
##              migration_prev_res_in_sunbelt 
##                                     183750 
##            num_persons_worked_for_employer 
##                                          0 
##                    family_members_under_18 
##                                     144232 
##                    country_of_birth_father 
##                                       6713 
##                    country_of_birth_mother 
##                                       6119 
##                      country_of_birth_self 
##                                       3393 
##                                citizenship 
##                                          0 
##              own_business_or_self_employed 
##                                          0 
## fill_inc_questionnaire_for_veteran's_admin 
##                                     197539 
##                          veterans_benefits 
##                                          0 
##                       weeks_worked_in_year 
##                                          0 
##                                       year 
##                                          0 
##                               income_level 
##                                          0
#some columns are complete (like age, education, wage_per_hour) and these seem like good variables to use to build models

##for complete columns, create visualizations to see if there's patterns related to income_level
#get variables with no missing data
completeCol = which(numMissingPerCol == 0)

#remove income_level and instance_weight from this vector because income_level is what we want to predict and we should not include instance_weight as a predictor
completeCol = completeCol[-which(names(completeCol) == "income_level" | names(completeCol) == "instance_weight")]

##visualize distributions of data separated by income_level for each variable

#function to plot density of continuous variable by income_level
plotDensityByIncome = function(i) {
  densityplot(census[,i], groups=census[ ,"income_level"], xlab = colNames[i], plot.points=FALSE, auto.key=TRUE, main = list(label = paste(colNames[i], "by Income Level (under/over 50K)"), cex = 0.7))
}
#plot density for all continous variables with no missing data
for(i in intersect(contVar, completeCol)) {
  print(plotDensityByIncome(i)) #need print here because lattice plot is strange in for loops
}

#function to plot histograms of non-continuous variable by income_level
plotHistByIncome = function(i) {
  histogram(~census[ ,i] | census[ ,42], xlab = colNames[i], las = 1, layout = c(1, 2), scales = list(x = list(rot = 90, cex = 0.5)))
}
#plot histograms for all non-continous variables with no missing data
for(i in intersect(notContVar, completeCol)) {
  print(plotHistByIncome(i))
}

##based on the plots, I'm going to do two logistic regression with different predictors:

#model 1- predictors: age, sex, race, marital status, education, detailed industry recode, detailed occupation recode, weeks worked in year, and number persons worked for employer
model_1 = glm(income_level ~ age + sex + race + marital_stat + education + detailed_industry_recode + detailed_occupation_recode + weeks_worked_in_year + num_persons_worked_for_employer, family = binomial, na.action = na.omit, data = census) 

#get summary of model 1
summary(model_1)
## 
## Call:
## glm(formula = income_level ~ age + sex + race + marital_stat + 
##     education + detailed_industry_recode + detailed_occupation_recode + 
##     weeks_worked_in_year + num_persons_worked_for_employer, family = binomial, 
##     data = census, na.action = na.omit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3979  -0.2366  -0.0944  -0.0001   4.2306  
## 
## Coefficients: (2 not defined because of singularities)
##                                                    Estimate Std. Error
## (Intercept)                                      -8.804e+00  2.241e-01
## age                                               3.841e-02  1.043e-03
## sex Male                                          1.124e+00  2.875e-02
## race Asian or Pacific Islander                    2.269e-01  1.731e-01
## race Black                                        2.895e-02  1.679e-01
## race Other                                        1.745e-01  2.053e-01
## race White                                        3.306e-01  1.607e-01
## marital_stat Married-A F spouse present          -2.638e-01  2.852e-01
## marital_stat Married-civilian spouse present      5.891e-02  3.999e-02
## marital_stat Married-spouse absent               -5.449e-02  1.328e-01
## marital_stat Never married                       -5.532e-01  5.249e-02
## marital_stat Separated                           -5.334e-02  1.010e-01
## marital_stat Widowed                              1.882e-01  7.671e-02
## education 11th grade                              2.568e-01  1.806e-01
## education 12th grade no diploma                   5.400e-01  2.231e-01
## education 1st 2nd 3rd or 4th grade               -3.404e-01  3.121e-01
## education 5th or 6th grade                       -4.760e-01  2.543e-01
## education 7th and 8th grade                      -1.157e-01  1.788e-01
## education 9th grade                              -7.974e-02  2.127e-01
## education Associates degree-academic program      1.439e+00  1.442e-01
## education Associates degree-occup /vocational     1.271e+00  1.435e-01
## education Bachelors degree(BA AB BS)              2.132e+00  1.344e-01
## education Children                               -1.246e+01  7.989e+01
## education Doctorate degree(PhD EdD)               3.359e+00  1.526e-01
## education High school graduate                    8.517e-01  1.338e-01
## education Less than 1st grade                    -1.734e+00  1.011e+00
## education Masters degree(MA MS MEng MEd MSW MBA)  2.738e+00  1.372e-01
## education Prof school degree (MD DDS DVM LLB JD)  3.070e+00  1.535e-01
## education Some college but no degree              1.261e+00  1.346e-01
## detailed_industry_recode1                         7.791e-01  3.833e-01
## detailed_industry_recode2                         1.092e+00  4.196e-01
## detailed_industry_recode3                         1.985e+00  3.515e-01
## detailed_industry_recode4                         1.191e+00  3.379e-01
## detailed_industry_recode5                         8.418e-01  3.282e-01
## detailed_industry_recode6                         6.985e-01  3.961e-01
## detailed_industry_recode7                         1.227e+00  3.797e-01
## detailed_industry_recode8                         1.642e+00  3.595e-01
## detailed_industry_recode9                         1.385e+00  3.513e-01
## detailed_industry_recode10                       -1.490e+01  8.092e+03
## detailed_industry_recode11                        1.499e+00  3.404e-01
## detailed_industry_recode12                        1.562e+00  3.427e-01
## detailed_industry_recode13                        2.240e+00  3.462e-01
## detailed_industry_recode14                        1.475e+00  3.715e-01
## detailed_industry_recode15                        1.531e+00  3.588e-01
## detailed_industry_recode16                        1.474e+00  3.564e-01
## detailed_industry_recode17                        8.841e-01  4.969e-01
## detailed_industry_recode18                        1.108e+00  3.865e-01
## detailed_industry_recode19                        7.952e-01  3.553e-01
## detailed_industry_recode20                        3.170e+00  5.400e-01
## detailed_industry_recode21                        8.289e-01  4.004e-01
## detailed_industry_recode22                        9.396e-01  3.826e-01
## detailed_industry_recode23                        1.864e+00  3.593e-01
## detailed_industry_recode24                        1.214e+00  3.446e-01
## detailed_industry_recode25                        1.862e+00  3.429e-01
## detailed_industry_recode26                        2.655e+00  4.009e-01
## detailed_industry_recode27                        1.153e+00  3.693e-01
## detailed_industry_recode28                        1.127e+00  5.427e-01
## detailed_industry_recode29                        1.501e+00  3.379e-01
## detailed_industry_recode30                        1.658e+00  3.425e-01
## detailed_industry_recode31                        1.721e+00  3.421e-01
## detailed_industry_recode32                        1.214e+00  3.383e-01
## detailed_industry_recode33                        7.015e-01  3.364e-01
## detailed_industry_recode34                        1.500e+00  3.379e-01
## detailed_industry_recode35                        1.222e+00  3.377e-01
## detailed_industry_recode36                        7.317e-01  7.137e-01
## detailed_industry_recode37                        1.344e+00  3.370e-01
## detailed_industry_recode38                        9.304e-01  3.519e-01
## detailed_industry_recode39                        6.749e-01  3.510e-01
## detailed_industry_recode40                        9.864e-01  3.503e-01
## detailed_industry_recode41                        7.737e-01  3.401e-01
## detailed_industry_recode42                        8.385e-01  3.404e-01
## detailed_industry_recode43                        4.458e-01  3.378e-01
## detailed_industry_recode44                        1.127e-01  3.539e-01
## detailed_industry_recode45                        1.196e+00  3.355e-01
## detailed_industry_recode46                        8.023e-01  2.971e-01
## detailed_industry_recode47                        1.421e+00  3.455e-01
## detailed_industry_recode48                        4.613e-01  3.655e-01
## detailed_industry_recode49                        1.579e+00  3.506e-01
## detailed_industry_recode50                        1.196e+00  3.415e-01
## detailed_industry_recode51                        1.744e+00  4.789e-01
## detailed_occupation_recode1                      -7.592e-01  3.507e-01
## detailed_occupation_recode2                       3.610e-03  3.285e-01
## detailed_occupation_recode3                      -7.720e-01  3.315e-01
## detailed_occupation_recode4                      -2.879e-01  3.338e-01
## detailed_occupation_recode5                      -2.559e-01  3.380e-01
## detailed_occupation_recode6                      -9.352e-01  3.428e-01
## detailed_occupation_recode7                       5.723e-01  3.482e-01
## detailed_occupation_recode8                      -2.074e-01  3.377e-01
## detailed_occupation_recode9                      -7.643e-01  3.485e-01
## detailed_occupation_recode10                     -1.041e+00  3.379e-01
## detailed_occupation_recode11                      2.210e-01  3.479e-01
## detailed_occupation_recode12                     -1.161e+00  3.330e-01
## detailed_occupation_recode13                     -1.250e+00  3.567e-01
## detailed_occupation_recode14                     -1.277e+00  3.431e-01
## detailed_occupation_recode15                     -6.933e-01  3.422e-01
## detailed_occupation_recode16                     -2.665e-01  3.336e-01
## detailed_occupation_recode17                     -3.904e-01  3.354e-01
## detailed_occupation_recode18                     -4.583e-01  3.388e-01
## detailed_occupation_recode19                     -1.425e+00  3.429e-01
## detailed_occupation_recode20                     -9.973e-01  6.309e-01
## detailed_occupation_recode21                     -9.271e-01  3.542e-01
## detailed_occupation_recode22                     -1.480e+00  3.890e-01
## detailed_occupation_recode23                     -2.087e+00  3.548e-01
## detailed_occupation_recode24                     -1.972e+00  3.639e-01
## detailed_occupation_recode25                     -1.983e+00  3.670e-01
## detailed_occupation_recode26                     -2.234e+00  3.365e-01
## detailed_occupation_recode27                     -2.635e+00  1.006e+00
## detailed_occupation_recode28                     -9.853e-01  3.409e-01
## detailed_occupation_recode29                     -2.458e+00  3.814e-01
## detailed_occupation_recode30                     -1.804e+00  3.869e-01
## detailed_occupation_recode31                     -2.745e+00  3.801e-01
## detailed_occupation_recode32                     -1.665e+00  3.734e-01
## detailed_occupation_recode33                     -1.247e+00  3.333e-01
## detailed_occupation_recode34                     -1.172e+00  3.357e-01
## detailed_occupation_recode35                     -1.350e+00  3.337e-01
## detailed_occupation_recode36                     -1.891e+00  3.387e-01
## detailed_occupation_recode37                     -2.012e+00  3.474e-01
## detailed_occupation_recode38                     -1.727e+00  3.392e-01
## detailed_occupation_recode39                     -1.336e+00  3.461e-01
## detailed_occupation_recode40                     -1.550e+00  4.093e-01
## detailed_occupation_recode41                     -2.292e+00  3.835e-01
## detailed_occupation_recode42                     -2.380e+00  3.730e-01
## detailed_occupation_recode43                     -1.173e+00  4.238e-01
## detailed_occupation_recode44                     -1.938e+00  4.010e-01
## detailed_occupation_recode45                             NA         NA
## detailed_occupation_recode46                             NA         NA
## weeks_worked_in_year                              4.007e-02  1.345e-03
## num_persons_worked_for_employer                   1.418e-01  6.691e-03
##                                                  z value Pr(>|z|)    
## (Intercept)                                      -39.284  < 2e-16 ***
## age                                               36.818  < 2e-16 ***
## sex Male                                          39.090  < 2e-16 ***
## race Asian or Pacific Islander                     1.311 0.189971    
## race Black                                         0.172 0.863085    
## race Other                                         0.850 0.395281    
## race White                                         2.057 0.039679 *  
## marital_stat Married-A F spouse present           -0.925 0.355064    
## marital_stat Married-civilian spouse present       1.473 0.140667    
## marital_stat Married-spouse absent                -0.410 0.681517    
## marital_stat Never married                       -10.538  < 2e-16 ***
## marital_stat Separated                            -0.528 0.597572    
## marital_stat Widowed                               2.453 0.014159 *  
## education 11th grade                               1.422 0.155114    
## education 12th grade no diploma                    2.420 0.015519 *  
## education 1st 2nd 3rd or 4th grade                -1.091 0.275330    
## education 5th or 6th grade                        -1.872 0.061203 .  
## education 7th and 8th grade                       -0.647 0.517391    
## education 9th grade                               -0.375 0.707708    
## education Associates degree-academic program       9.977  < 2e-16 ***
## education Associates degree-occup /vocational      8.855  < 2e-16 ***
## education Bachelors degree(BA AB BS)              15.864  < 2e-16 ***
## education Children                                -0.156 0.876088    
## education Doctorate degree(PhD EdD)               22.013  < 2e-16 ***
## education High school graduate                     6.366 1.94e-10 ***
## education Less than 1st grade                     -1.715 0.086297 .  
## education Masters degree(MA MS MEng MEd MSW MBA)  19.959  < 2e-16 ***
## education Prof school degree (MD DDS DVM LLB JD)  19.994  < 2e-16 ***
## education Some college but no degree               9.369  < 2e-16 ***
## detailed_industry_recode1                          2.033 0.042101 *  
## detailed_industry_recode2                          2.603 0.009229 ** 
## detailed_industry_recode3                          5.648 1.62e-08 ***
## detailed_industry_recode4                          3.524 0.000424 ***
## detailed_industry_recode5                          2.565 0.010321 *  
## detailed_industry_recode6                          1.763 0.077835 .  
## detailed_industry_recode7                          3.231 0.001232 ** 
## detailed_industry_recode8                          4.569 4.90e-06 ***
## detailed_industry_recode9                          3.942 8.08e-05 ***
## detailed_industry_recode10                        -0.002 0.998531    
## detailed_industry_recode11                         4.404 1.06e-05 ***
## detailed_industry_recode12                         4.557 5.19e-06 ***
## detailed_industry_recode13                         6.468 9.90e-11 ***
## detailed_industry_recode14                         3.972 7.13e-05 ***
## detailed_industry_recode15                         4.266 1.99e-05 ***
## detailed_industry_recode16                         4.136 3.53e-05 ***
## detailed_industry_recode17                         1.779 0.075215 .  
## detailed_industry_recode18                         2.866 0.004155 ** 
## detailed_industry_recode19                         2.238 0.025209 *  
## detailed_industry_recode20                         5.869 4.38e-09 ***
## detailed_industry_recode21                         2.070 0.038422 *  
## detailed_industry_recode22                         2.456 0.014058 *  
## detailed_industry_recode23                         5.187 2.14e-07 ***
## detailed_industry_recode24                         3.524 0.000425 ***
## detailed_industry_recode25                         5.432 5.58e-08 ***
## detailed_industry_recode26                         6.622 3.54e-11 ***
## detailed_industry_recode27                         3.121 0.001804 ** 
## detailed_industry_recode28                         2.076 0.037899 *  
## detailed_industry_recode29                         4.442 8.92e-06 ***
## detailed_industry_recode30                         4.840 1.30e-06 ***
## detailed_industry_recode31                         5.032 4.86e-07 ***
## detailed_industry_recode32                         3.588 0.000333 ***
## detailed_industry_recode33                         2.085 0.037029 *  
## detailed_industry_recode34                         4.439 9.04e-06 ***
## detailed_industry_recode35                         3.618 0.000296 ***
## detailed_industry_recode36                         1.025 0.305242    
## detailed_industry_recode37                         3.989 6.63e-05 ***
## detailed_industry_recode38                         2.644 0.008193 ** 
## detailed_industry_recode39                         1.923 0.054480 .  
## detailed_industry_recode40                         2.815 0.004870 ** 
## detailed_industry_recode41                         2.275 0.022923 *  
## detailed_industry_recode42                         2.463 0.013764 *  
## detailed_industry_recode43                         1.320 0.186978    
## detailed_industry_recode44                         0.318 0.750210    
## detailed_industry_recode45                         3.566 0.000363 ***
## detailed_industry_recode46                         2.700 0.006931 ** 
## detailed_industry_recode47                         4.114 3.90e-05 ***
## detailed_industry_recode48                         1.262 0.206906    
## detailed_industry_recode49                         4.504 6.68e-06 ***
## detailed_industry_recode50                         3.502 0.000462 ***
## detailed_industry_recode51                         3.642 0.000271 ***
## detailed_occupation_recode1                       -2.165 0.030380 *  
## detailed_occupation_recode2                        0.011 0.991233    
## detailed_occupation_recode3                       -2.329 0.019879 *  
## detailed_occupation_recode4                       -0.863 0.388282    
## detailed_occupation_recode5                       -0.757 0.448921    
## detailed_occupation_recode6                       -2.729 0.006360 ** 
## detailed_occupation_recode7                        1.644 0.100241    
## detailed_occupation_recode8                       -0.614 0.539217    
## detailed_occupation_recode9                       -2.193 0.028312 *  
## detailed_occupation_recode10                      -3.080 0.002067 ** 
## detailed_occupation_recode11                       0.635 0.525352    
## detailed_occupation_recode12                      -3.487 0.000488 ***
## detailed_occupation_recode13                      -3.504 0.000458 ***
## detailed_occupation_recode14                      -3.721 0.000198 ***
## detailed_occupation_recode15                      -2.026 0.042755 *  
## detailed_occupation_recode16                      -0.799 0.424416    
## detailed_occupation_recode17                      -1.164 0.244466    
## detailed_occupation_recode18                      -1.353 0.176194    
## detailed_occupation_recode19                      -4.157 3.23e-05 ***
## detailed_occupation_recode20                      -1.581 0.113922    
## detailed_occupation_recode21                      -2.617 0.008858 ** 
## detailed_occupation_recode22                      -3.804 0.000142 ***
## detailed_occupation_recode23                      -5.881 4.08e-09 ***
## detailed_occupation_recode24                      -5.419 5.98e-08 ***
## detailed_occupation_recode25                      -5.402 6.59e-08 ***
## detailed_occupation_recode26                      -6.639 3.16e-11 ***
## detailed_occupation_recode27                      -2.619 0.008831 ** 
## detailed_occupation_recode28                      -2.890 0.003849 ** 
## detailed_occupation_recode29                      -6.447 1.14e-10 ***
## detailed_occupation_recode30                      -4.663 3.11e-06 ***
## detailed_occupation_recode31                      -7.223 5.10e-13 ***
## detailed_occupation_recode32                      -4.459 8.24e-06 ***
## detailed_occupation_recode33                      -3.743 0.000182 ***
## detailed_occupation_recode34                      -3.490 0.000483 ***
## detailed_occupation_recode35                      -4.044 5.25e-05 ***
## detailed_occupation_recode36                      -5.582 2.37e-08 ***
## detailed_occupation_recode37                      -5.792 6.97e-09 ***
## detailed_occupation_recode38                      -5.091 3.55e-07 ***
## detailed_occupation_recode39                      -3.861 0.000113 ***
## detailed_occupation_recode40                      -3.787 0.000153 ***
## detailed_occupation_recode41                      -5.976 2.29e-09 ***
## detailed_occupation_recode42                      -6.380 1.77e-10 ***
## detailed_occupation_recode43                      -2.769 0.005621 ** 
## detailed_occupation_recode44                      -4.832 1.35e-06 ***
## detailed_occupation_recode45                          NA       NA    
## detailed_occupation_recode46                          NA       NA    
## weeks_worked_in_year                              29.786  < 2e-16 ***
## num_persons_worked_for_employer                   21.200  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92815  on 199522  degrees of freedom
## Residual deviance: 53700  on 199397  degrees of freedom
## AIC: 53952
## 
## Number of Fisher Scoring iterations: 19
#model 2- predictors: age, sex, race, education, weeks worked in year
#this a reduced set of the predictors used for model 1 because I wanted to know if we could still do accurate predictions with fewers predictors  
model_2 = glm(income_level ~ age + sex + race + education + weeks_worked_in_year, data = census, family = binomial, na.action = na.omit) 

#get summary of model 2
summary(model_2)
## 
## Call:
## glm(formula = income_level ~ age + sex + race + education + weeks_worked_in_year, 
##     family = binomial, data = census, na.action = na.omit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3940  -0.2937  -0.1000  -0.0001   3.9842  
## 
## Coefficients:
##                                                    Estimate Std. Error
## (Intercept)                                      -9.647e+00  2.137e-01
## age                                               4.174e-02  8.692e-04
## sex Male                                          1.366e+00  2.450e-02
## race Asian or Pacific Islander                    2.626e-01  1.671e-01
## race Black                                       -3.995e-03  1.630e-01
## race Other                                        8.343e-02  1.974e-01
## race White                                        4.252e-01  1.563e-01
## education 11th grade                              1.990e-01  1.783e-01
## education 12th grade no diploma                   5.463e-01  2.192e-01
## education 1st 2nd 3rd or 4th grade               -4.841e-01  3.097e-01
## education 5th or 6th grade                       -6.153e-01  2.518e-01
## education 7th and 8th grade                      -1.104e-01  1.772e-01
## education 9th grade                              -1.181e-01  2.099e-01
## education Associates degree-academic program      1.971e+00  1.408e-01
## education Associates degree-occup /vocational     1.713e+00  1.404e-01
## education Bachelors degree(BA AB BS)              2.782e+00  1.313e-01
## education Children                               -1.144e+01  4.803e+01
## education Doctorate degree(PhD EdD)               3.949e+00  1.445e-01
## education High school graduate                    1.044e+00  1.320e-01
## education Less than 1st grade                    -2.037e+00  1.011e+00
## education Masters degree(MA MS MEng MEd MSW MBA)  3.305e+00  1.331e-01
## education Prof school degree (MD DDS DVM LLB JD)  4.091e+00  1.405e-01
## education Some college but no degree              1.626e+00  1.323e-01
## weeks_worked_in_year                              5.495e-02  8.876e-04
##                                                  z value Pr(>|z|)    
## (Intercept)                                      -45.137  < 2e-16 ***
## age                                               48.016  < 2e-16 ***
## sex Male                                          55.747  < 2e-16 ***
## race Asian or Pacific Islander                     1.571  0.11609    
## race Black                                        -0.025  0.98045    
## race Other                                         0.423  0.67261    
## race White                                         2.721  0.00651 ** 
## education 11th grade                               1.116  0.26438    
## education 12th grade no diploma                    2.492  0.01269 *  
## education 1st 2nd 3rd or 4th grade                -1.563  0.11807    
## education 5th or 6th grade                        -2.443  0.01456 *  
## education 7th and 8th grade                       -0.623  0.53331    
## education 9th grade                               -0.563  0.57355    
## education Associates degree-academic program      13.994  < 2e-16 ***
## education Associates degree-occup /vocational     12.201  < 2e-16 ***
## education Bachelors degree(BA AB BS)              21.192  < 2e-16 ***
## education Children                                -0.238  0.81172    
## education Doctorate degree(PhD EdD)               27.326  < 2e-16 ***
## education High school graduate                     7.913 2.52e-15 ***
## education Less than 1st grade                     -2.015  0.04387 *  
## education Masters degree(MA MS MEng MEd MSW MBA)  24.824  < 2e-16 ***
## education Prof school degree (MD DDS DVM LLB JD)  29.115  < 2e-16 ***
## education Some college but no degree              12.294  < 2e-16 ***
## weeks_worked_in_year                              61.903  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 92815  on 199522  degrees of freedom
## Residual deviance: 59148  on 199499  degrees of freedom
## AIC: 59196
## 
## Number of Fisher Scoring iterations: 18

Step 4: Choose the model that appears to have the highest performance based on a comparison between reality (the 42nd variable) and the model’s prediction.

##using model, predict outcome
#if prediction > 0.5, label as "over", else label as "under" $50K income level
probs_model1 = predict(model_1,  type='response')
pred_model1 = ifelse(probs_model1 > 0.5, "over", "under")

#calculate accuracy
accuracy_model1 = mean(pred_model1 == census[ , "income_level"])
accuracy_model1
## [1] 0.9485573
#same prediction and accuracy calculation for model 2
probs_model2 = predict(model_2,  type='response')
pred_model2 = ifelse(probs_model2 > 0.5, "over", "under")
accuracy_model2 = mean(pred_model2 == census[ , "income_level"])
accuracy_model2
## [1] 0.9426181
##model 1 is slightly more accurate but there are many more degrees of freedom so is it actually better?
#these models are nested so can do likelihood ratio test
lrtest(model_1, model_2)
## Likelihood ratio test
## 
## Model 1: income_level ~ age + sex + race + marital_stat + education + 
##     detailed_industry_recode + detailed_occupation_recode + weeks_worked_in_year + 
##     num_persons_worked_for_employer
## Model 2: income_level ~ age + sex + race + education + weeks_worked_in_year
##   #Df LogLik   Df  Chisq Pr(>Chisq)    
## 1 126 -26850                           
## 2  24 -29574 -102 5447.6  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#they are statistically significant despite difference in degrees of freedom so let's proceed with model 1 as best fit of the 2

##plot ROC curves to visualize and compare model sensitivity (true positive rate) and specificity (true negative rate) where "over" is positive

#format data as numeric for ROC curves
pred_model1_numeric = ifelse(pred_model1 == "under", 1, 2)
roc_model1 = roc(income_level ~ pred_model1_numeric, data = census)

pred_model2_numeric = ifelse(pred_model2 == "under", 1, 2)
roc_model2 = roc(income_level ~ pred_model2_numeric, data = census)

plot(roc_model1, col = colPal_all[1], main = "ROC curves (over = case, under = control)")
lines(roc_model2, col = colPal_all[4])
legend("topleft", c("Model 1", "Model 2"), col = colPal_all[c(1,4)], lty = 1)

Step 5: Apply your model to the test file and measure it’s real performance on it (same method as above).